In [1]:
import json, os, glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib import rc
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize, StandardScaler, minmax_scale, scale, maxabs_scale
from sklearn.metrics import classification_report, accuracy_score, f1_score, roc_curve
from sklearn.metrics import recall_score, precision_score, confusion_matrix, roc_auc_score
from tqdm.auto import tqdm as tn
import joblib
import sys
import json
import time
sys.path.append('../')
import pulicFuncs as pf
import seaborn as sns
import warnings

warnings.filterwarnings('ignore')
plt.style.use('fivethirtyeight')
rc('mathtext', default='regular')

mpl.font_manager.fontManager.addfont(r"./wqy-microhei.ttc")
# mpl.font_manager.fontManager.ttflist 获得临时注册字体的名字
mpl.rcParams['axes.unicode_minus'] = False  #负号正常显示
mpl.rcParams['font.sans-serif'] = ['WenQuanYi Micro Hei']

myfont = mpl.font_manager.FontProperties(
    fname=r"wqy-microhei.ttc")

pd.set_option('display.max_columns', None)  # pandas 完全显示列
In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV, Lasso, LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier, CatBoostRegressor
from xgboost import XGBClassifier


def draw_feature_importance(df, importance_name='fscore', top_N=10):
    """Draw a bar chart of feature importances with a dashed marker at the
    TOP-N cutoff.

    Parameters
    ----------
    df : DataFrame with a 'features' column and an importance column,
        assumed to be sorted by importance with a 0..n-1 index.
    importance_name : str
        Name of the importance column to plot.
    top_N : int
        Number of leading features highlighted by the cutoff line.
    """
    n_features = len(df)
    # Figure width scales with the number of bars so labels stay readable.
    df.plot(kind='bar', x='features', y=importance_name,
            legend=False, figsize=(n_features * 0.3, 10))
    top_value = df[importance_name][0]  # tallest bar sets the line height
    plt.vlines(x=top_N + 0.5, ymin=0, ymax=top_value,
               linestyles='--', color='black')
    plt.text(top_N, top_value - 0.002, f'TOP{top_N}', ha='right')
    plt.title('XGBoost Feature Importance', fontproperties=myfont)
    # Re-set tick labels with the CJK-capable font so Chinese names render.
    plt.xticks(ticks=range(n_features), labels=df['features'].tolist(),
               rotation=90, fontproperties=myfont)
    plt.xlabel('features', fontproperties=myfont)
    plt.ylabel('relative importance', fontproperties=myfont)
    plt.show()

加载数据集¶

In [3]:
all_data = pd.read_csv('./data/20221027数据/1819训练数据.csv')
all_testSet = pd.read_csv('./data/20221027数据/20年测试数据.csv')
In [4]:
all_data.shape
Out[4]:
(3463, 413)
In [5]:
all_data = all_data.fillna(-9999999)
all_testSet = all_testSet.fillna(-9999999)
In [6]:
def ww_get_value_counts_and_rate(df, colname):
    """
    Print the count and proportion of each distinct value in a column.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the column to summarize.
    colname : str
        Name of the column whose value distribution is reported.

    Notes
    -----
    The proportion denominator is the total number of rows (``df.shape[0]``),
    so if the column contains NaN the proportions do not sum to 1.
    """
    counts = df[colname].value_counts()  # compute once, reuse for both columns
    tmp = pd.DataFrame()
    tmp[f'{colname}_num'] = counts
    tmp[f'{colname}_rate'] = counts / df.shape[0]
    print(tmp)
In [7]:
ww_get_value_counts_and_rate(all_data, 'target')  # 统计一下标签每个取值的数量
   target_num  target_rate
0        2506      0.72365
1         957      0.27635
In [8]:
# Drop rows whose label is the missing-value sentinel (-9999999), then report
# the class balance of the remaining training rows.
df = all_data[all_data['target'] != -9999999].copy()
print(df.shape)
target_dist = df['target'].value_counts().sort_index()
good_bad_rate = target_dist[0] / target_dist[1]  # 0 = good customer, 1 = bad customer
print(f'好坏比:{good_bad_rate}')
(3463, 413)
好坏比:2.618599791013584

建模¶

切分出测试集¶

In [9]:
# index = list(range(len(df)))  # 1~3182,对于行号
# np.random.shuffle(index)
# index_df = pd.DataFrame(index, columns=['index'])  # 生成一个df,只有一列,是乱序的行号
# index_df.to_csv(f'./20221027数据/index_ALL.csv', index=False)  # index=False 不会把df的行index写入csv
In [10]:
index_df = pd.read_csv(f'./data/20221027数据/index_ALL.csv')  # 打乱的顺序保存在csv后再读取,便于复现
index = index_df['index'].values.tolist()  # index 是行索引乱序列表

data_df = df.iloc[index[:], :].copy()  # 获得行乱序的训练数据
testSet = all_testSet.copy()  # 测试数据不打乱
In [11]:
data_df.shape, testSet.shape
Out[11]:
((3463, 413), (2792, 413))
In [12]:
ww_get_value_counts_and_rate(data_df, 'target')
ww_get_value_counts_and_rate(testSet, 'target')
   target_num  target_rate
0        2506      0.72365
1         957      0.27635
   target_num  target_rate
0        2155     0.771848
1         637     0.228152

特征缺失率统计¶

In [13]:
# 业务关注的特征列表
care = ['区域经济_生产总值(GDP)(元)', '区域经济_生产总值同比增长(%)', '区域财政_财政收入(元)',
        '区域财政_一般预算收入增长', '区域财政_财政自给率(%)', '区域财政_一般预算收入占比(%)',
        '区域财政_基金收入占比(%)', '区域财政_政府负债率(%)', '区域财政_地方政府债务率(%)',
        '区域利差_1年期AAA', '区域利差_1年期AA+', '区域利差_1年期AA', '区域利差_1年期不限评级',
        '区域利差_2年期AAA', '区域利差_2年期AA+', '区域利差_2年期AA', '区域利差_2年期不限评级',
        '区域利差_3年期AAA', '区域利差_3年期AA+', '区域利差_3年期AA', '区域利差_3年期不限评级',
        '区域利差_4年期AAA', '区域利差_4年期AA+', '区域利差_4年期AA', '区域利差_4年期不限评级',
        '区域贷款_区域银行贷款余额(亿元)']
In [14]:
null_rate = 0.85
null_rate_df = pf.get_null_rate_df(data_df.iloc[:, 8:].replace(-9999999, np.nan))
Nnll_features = null_rate_df[null_rate_df['null_rate'] < null_rate]['features'].tolist()  # 获得小于指定缺失值的特征列表
null_ratio = len(Nnll_features) / len(df.columns[8:])
print(len(Nnll_features), null_ratio)
383 0.945679012345679
In [15]:
null_rate_df[null_rate_df['features'].isin(care)]
Out[15]:
features null_rate
27 区域利差_3年期AAA 0.696795
30 区域利差_2年期AAA 0.621138
31 区域利差_1年期AAA 0.616806
33 区域利差_4年期AAA 0.581288
34 区域利差_3年期AA 0.557031
51 区域利差_3年期AA+ 0.417846
58 区域利差_2年期AA+ 0.350563
59 区域利差_1年期AA+ 0.341611
67 区域利差_2年期AA 0.324285
68 区域利差_1年期AA 0.320243
74 区域利差_4年期AA+ 0.274617
75 区域利差_3年期不限评级 0.272596
95 区域利差_2年期不限评级 0.199249
96 区域利差_4年期AA 0.199249
97 区域利差_1年期不限评级 0.193474
133 区域利差_4年期不限评级 0.112619
363 区域财政_基金收入占比(%) 0.033786
370 区域财政_一般预算收入增长 0.020791
372 区域贷款_区域银行贷款余额(亿元) 0.012995
386 区域财政_地方政府债务率(%) 0.000866
390 区域财政_政府负债率(%) 0.000866
392 区域经济_生产总值同比增长(%) 0.000289
393 区域经济_生产总值(GDP)(元) 0.000000
394 区域财政_一般预算收入占比(%) 0.000000
397 区域财政_财政自给率(%) 0.000000
402 区域财政_财政收入(元) 0.000000
In [16]:
pf.draw_null_rate_distribution(
    data_df.iloc[:, 8:], axis=0, ascending=False, n_top=20, null_rate=null_rate)
# 缺失率小于85%的特征的占比为95%,绘制了缺失率前20的特征

缺失率筛选特征¶

In [17]:
## Missing-rate statistics per feature, computed separately for each year
N = 8  # number of leading non-feature (meta) columns to skip
null_rate = 0.5  # per-year missing-rate threshold a feature must stay below

tmp = pd.concat([data_df, testSet], axis=0)
tmp.iloc[:, N:] = tmp.iloc[:, N:].replace(-9999999, np.nan).isnull()  # convert feature columns to boolean null flags for counting
tmp_null_df = tmp.groupby(['year']).sum() / tmp.groupby(['year']).count()  # per-year missing rate; sum treats False as 0
feature_null_df = pd.DataFrame(
    (tmp_null_df.iloc[:, N:] < null_rate).sum(), columns=['符合要求的年数'])  # per feature: number of years whose missing rate is under the threshold
Nnll_features = feature_null_df[feature_null_df['符合要求的年数'] == 3].index.tolist()  # keep features that pass in all 3 years
print(len(Nnll_features))
352
In [18]:
set(care) - set(Nnll_features)  # 比较关注的特征中,不满足三年缺失率的特征
Out[18]:
{'区域利差_1年期AAA', '区域利差_2年期AAA', '区域利差_3年期AA', '区域利差_3年期AAA', '区域利差_4年期AAA'}

训练数据切分¶

In [19]:
value_features = data_df.columns[8:].tolist()

X_training, X_validation, y_training, y_validation = train_test_split(
    data_df.loc[:, value_features],
    data_df.loc[:, 'target'], 
    test_size=0.3,
    random_state=0,
    stratify=data_df.loc[:, 'target']  # 控制训练集和验证集中target的分布
)
X_testing = testSet.loc[:, value_features].copy()
y_testing = testSet['target']
In [20]:
X_training.shape[0], X_validation.shape[0], X_testing.shape[0]
Out[20]:
(2424, 1039, 2792)
In [21]:
print(y_training.value_counts())
print(y_validation.value_counts())  # 很符合之前的分布
0    1754
1     670
Name: target, dtype: int64
0    752
1    287
Name: target, dtype: int64

进一步筛选特征¶

通过iv¶

In [22]:
tmp = pd.concat([data_df, testSet], axis=0)
iv_df, woe_df = pf.iv_woe(
    tmp[['target'] + care],  # 查看业务老师关注的特征的iv与woe
    'target', bins=20
)
iv_woe:   0%|          | 0/26 [00:00<?, ?it/s]
In [23]:
iv_df
Out[23]:
features iv
0 区域财政_财政收入(元) 1.231433
0 区域财政_财政自给率(%) 1.054307
0 区域经济_生产总值(GDP)(元) 1.052078
0 区域利差_4年期不限评级 1.017426
0 区域贷款_区域银行贷款余额(亿元) 0.909544
0 区域利差_2年期不限评级 0.746671
0 区域利差_1年期不限评级 0.724916
0 区域利差_4年期AA+ 0.662673
0 区域财政_一般预算收入增长 0.659968
0 区域利差_3年期不限评级 0.655208
0 区域利差_4年期AAA 0.615490
0 区域利差_4年期AA 0.597348
0 区域利差_3年期AA+ 0.591655
0 区域财政_一般预算收入占比(%) 0.567629
0 区域利差_3年期AAA 0.563559
0 区域财政_基金收入占比(%) 0.537443
0 区域利差_1年期AA+ 0.528846
0 区域利差_1年期AA 0.518764
0 区域财政_政府负债率(%) 0.512509
0 区域利差_2年期AAA 0.510014
0 区域利差_2年期AA+ 0.502329
0 区域利差_1年期AAA 0.444882
0 区域利差_2年期AA 0.437265
0 区域财政_地方政府债务率(%) 0.359108
0 区域利差_3年期AA 0.358700
0 区域经济_生产总值同比增长(%) 0.206396
In [24]:
woe_df
Out[24]:
features Cutoff N Events % of Events Non_Events % of Non_Events woe iv
0 区域财政_财政收入(元) (-9999999.001, 28085150000.0] 325 147 0.092221 178 0.038189 0.881617 0.047636
1 区域财政_财政收入(元) (28085150000.0, 36280690000.0] 302 170 0.106650 132 0.028320 1.325954 0.103863
2 区域财政_财政收入(元) (36280690000.0, 43923306000.0] 312 149 0.093476 163 0.034971 0.983162 0.057520
3 区域财政_财政收入(元) (43923306000.0, 54301870000.0] 324 142 0.089084 182 0.039047 0.824789 0.041271
4 区域财政_财政收入(元) (54301870000.0, 61263140000.0] 306 135 0.084693 171 0.036687 0.836579 0.040161
... ... ... ... ... ... ... ... ... ...
412 区域经济_生产总值同比增长(%) (7.4, 7.7] 296 95 0.059598 201 0.043124 0.323549 0.005331
413 区域经济_生产总值同比增长(%) (7.7, 7.9] 337 110 0.069009 227 0.048702 0.348508 0.007077
414 区域经济_生产总值同比增长(%) (7.9, 8.1] 371 114 0.071518 257 0.055138 0.260102 0.004261
415 区域经济_生产总值同比增长(%) (8.1, 8.5] 243 77 0.048306 166 0.035615 0.304794 0.003869
416 区域经济_生产总值同比增长(%) (8.5, 12.0] 297 142 0.089084 155 0.033255 0.985367 0.055013

417 rows × 9 columns

In [25]:
tmp = pd.concat([data_df, testSet], axis=0)
iv_df, woe_df = pf.iv_woe(
    tmp[['target'] + Nnll_features],  # Nnll_features 是通过每年缺失率选出来的
    'target', bins=20
)
iv_woe:   0%|          | 0/352 [00:00<?, ?it/s]
In [26]:
selected_features = iv_df[iv_df['iv']>0.1]['features'].tolist()
len(selected_features)
Out[26]:
208

模型给出的特征重要性¶

In [27]:
params = {
    'learning_rate': 0.1,
    'random_state': 47, 
    'max_depth': 3,
    'min_child_weight': 20, 
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'gamma': 0,
    'objective': 'binary:logistic',
    'scale_pos_weight': good_bad_rate,
    'nthread': 14,
    'silent': 1,
}
tmp = pd.concat([data_df, testSet], axis=0)
feature_importance_df = pf.get_feature_importance_from_xgboost(
    # 用Xgboost进行特征筛选,并从xgboost获取feature importance
    X_training[Nnll_features],  # 训练集只取部分
    y_training, 
    rounds=300,  # 迭代轮数
    params=params  # XGB的参数
)
[13:37:36] WARNING: ../src/learner.cc:576: 
Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[13:37:36] WARNING: ../src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
('feature number:', 316)
In [28]:
feature_importance_df
Out[28]:
features fscore
0 资产规模数据_资产总计 0.015593
1 财报衍生_其他应收款/总资产 0.015593
2 成长能力_营业利润(N年,增长率) 0.011525
3 区域利差_4年期AA 0.011525
4 成长能力_每股经营活动产生的现金流量净额(同比增长率) 0.010847
... ... ...
311 盈利能力_财务费用/营业总收入(TTM) 0.000678
312 盈利能力_营业总成本/营业总收入(TTM) 0.000678
313 盈利能力_成本费用利润率 0.000678
314 盈利能力_息税前利润/营业总收入 0.000678
315 偿债能力_产权比率 0.000678

316 rows × 2 columns

In [29]:
# 先用特征重要性筛特征,再用iv筛特征
# importance_features = feature_importance_df['features'].tolist()[:N_features]
# iv_df, woe_df = pf.iv_woe(
#     data_df[['target']+importance_features],
#     'target', bins=20
# )
# iv_df = iv_df.rename(columns={'iv': 'IV'})
# iv_df['feature_class'] = iv_df['features'].map(lambda x: x.split('_')[0])
# iv_df.index = iv_df['features'].tolist()
# iv_df[:50]
In [30]:
draw_feature_importance(feature_importance_df[:60], 'fscore', 30)
In [31]:
selected_features = feature_importance_df['features'].tolist()[:60]

相关性¶

In [32]:
# 查看选出特征的相关性
uncorr_features = pf.get_uncorr_features(
    data_df[['target']+selected_features],
    corr_rate=0.95, target_name='target')
print(len(uncorr_features))
40
In [33]:
# pf.draw_corr_heatmap(data_df[['target']+uncorr_features], target_name='target')
In [34]:
# 通过相关性进一步筛选特征
uncorr_feature_df = feature_importance_df[
    feature_importance_df['features'].isin(uncorr_features)
].sort_values('fscore', ascending=False)
selected_features = uncorr_feature_df['features'].tolist()[:30]
selected_features
Out[34]:
['资产规模数据_资产总计',
 '财报衍生_其他应收款/总资产',
 '成长能力_营业利润(N年,增长率)',
 '区域利差_4年期AA',
 '成长能力_每股经营活动产生的现金流量净额(同比增长率)',
 '区域财政_政府负债率(%)',
 '营运能力_固定资产周转率',
 '盈利能力_EBITDA/营业总收入',
 '区域利差_4年期不限评级',
 '区域财政_税收返还收入(元)',
 '区域利差_2年期不限评级',
 '资本结构_长期资产适合率',
 '营运能力_应收账款周转天数',
 '偿债能力_货币资金/短期债务',
 '区域经济_人均GDP(元)',
 '财报衍生_净利润',
 '财报衍生_流动负债增长率',
 '财报衍生_有形净资产',
 '财报衍生_在建工程',
 '财报衍生_固定资产',
 '现金流量_投资活动产生的现金流量净额占比',
 '区域利差_2年期AA',
 '财报衍生_净资产(即所有者权益)',
 '区域财政_专项债券付息支出(元)',
 '成长能力_价值变动净收益(N年,增长率)',
 '财报衍生_预收帐款周转天数',
 '财报衍生_资金回笼率',
 '财报衍生_有形总资产',
 '区域财政_存续城投债务规模/一般预算收入(%)',
 '区域财政_国有资本经营收入(元)']

手动调整入模特征¶

In [35]:
# 基于业务需要,指定了入模特征
last_selected_features = ['资产规模数据_资产总计', '成交_成交土地均价(元/㎡)',
                          '财报衍生_净资产(即所有者权益)', '区域财政_政府负债率(%)',
                          '区域财政_财政自给率(%)', '财报衍生_流动资产',
                          '区域财政_存续城投债务规模/一般预算收入(%)', '财报衍生_固定资产',
                          '偿债能力_货币资金/短期债务', '财报衍生_净利润',
                          '区域经济_生产总值(GDP)(元)', 
                          '区域贷款_区域银行贷款余额(亿元)',
                          '区域财政_一般预算收入占比(%)', 
                          '区域利差_1年期不限评级',
                          '区域财政_地方政府债务率(%)', '区域财政_一般预算收入增长',
                          '偿债能力_现金比率', '财报衍生_其他应付款/流动负债',
                          #'财报衍生_资金回笼率',   # 21年数据没这个特征
                          '每股指标_每股现金流量净额',
                          '区域财政_基金收入占比(%)', '财报衍生_其他应收款/总资产',
                          '偿债能力_有形净值债务率', '财报衍生_无形资产',
                          #'财报衍生_流动负债增长率', 
                          '财报衍生_存货周转天数',
                          '区域经济_生产总值同比增长(%)', '财报衍生_其他应付款/流动资产',
                          '成长能力_净利润复合年增长率', '区域财政_财政收入(元)']
In [36]:
# 由于21年数据有两个特征没有,于是剔除,从备选列表中找两个特征补充
# tmp = list(set(importance_features[:(30)]) - set(last_selected_features))
# beixuan = ['财报衍生_预付账款周转天数',
# '财报衍生_营业利润+管理费用+财务费用', '财报衍生_应收账款周转率', '财报衍生_净资产同比增长率',
# '财报衍生_净资产(即所有者权益)', '财报衍生_其他应收款/总资产', '财报衍生_其他应付款/流动资产',
# '财报衍生_其他应付款/流动负债', '财报衍生_两金占流动资产比重', '财报衍生_流动资产', '财报衍生_固定资产',
# '财报衍生_净利润', '资产规模数据_资产总计', '盈利能力_资产减值损失/营业总收入(TTM)', '偿债能力_货币资金/短期债务',
# '成长能力_营业收入(同比增长率)', '成长能力_营业利润(N年,增长率)', '每股指标_每股现金流量净额',
# '成长能力_每股净资产(相对年初增长率)', '成长能力_归属母公司股东的权益(相对年初增长率)', '营运能力_应收账款周转天数',
# '营运能力_应付账款周转天数',
# '偿债能力_现金比率',
# '财报衍生_存货周转天数',
# '成长能力_净利润复合年增长率',
# '偿债能力_有形净值债务率',
# '财报衍生_无形资产'
# '成交_土地宗数(宗)', '成交_土地出让金(万元)', '成交_建设用地面积(万㎡)',
# '成交_规划建筑面积(万㎡)', '成交_平均溢价率(%)', '成交_成交楼面均价(元/㎡)', '成交_成交土地均价(元/㎡)',
# '流拍_土地宗数(宗)', '流拍_建设用地面积(万㎡)', '流拍_规划建筑面积(万㎡)', '流拍_起始价(万元)',
# '流拍_流拍楼面均价(元/㎡)', '流拍_流拍土地均价(元/㎡)', '流拍率(流拍/成交)', '流拍率(流拍/(成交+流拍))']
# [i for i in tmp if i in beixuan]
In [37]:
selected_features = last_selected_features + ['成长能力_营业利润(N年,增长率)', '成长能力_归属母公司股东的权益(相对年初增长率)']
len(selected_features)
Out[37]:
30

模型训练¶

训练并调参¶

In [38]:
print(X_training.shape, X_validation.shape, X_testing.shape)
print(y_training.shape, y_validation.shape, y_testing.shape)
(2424, 405) (1039, 405) (2792, 405)
(2424,) (1039,) (2792,)
In [40]:
X_training_selected = X_training[selected_features]
X_validation_selected = X_validation[selected_features]

# model = XGBClassifier(
#     learning_rate=0.01,
#     random_state=47, 
#     n_estimators=500, 
#     max_depth=4, 
#     min_child_weight=20, 
#     subsample=0.8,
#     colsample_bytree=0.75,
#     gamma=0,
#     objective='binary:logistic',
#     nthread=14,
#     scale_pos_weight=good_bad_rate,
# )

# model = RandomForestClassifier(
#     n_estimators=100,
#     max_depth=4,
#     class_weight='balanced',
#     random_state=47
# )

model = CatBoostClassifier(
    iterations=300,
    depth=4,
    learning_rate=0.01,
    l2_leaf_reg=3,
#     loss_function='CrossEntropy',
    verbose=False,
    scale_pos_weight=good_bad_rate,
    random_seed = 0,
)

# model = CatBoostClassifier(
#     iterations=70,
#     depth=4,
#     learning_rate=0.08,
#     l2_leaf_reg=3.0,
# #     loss_function='CrossEntropy',
#     verbose=False,
#     scale_pos_weight=good_bad_rate,
#     random_seed=0,
# )

model.fit(X_training_selected, y_training)
Out[40]:
<catboost.core.CatBoostClassifier at 0x7f122d6bf890>
In [41]:
# 计算模型分
input_data={
    'Training': {
        'X'  : X_training_selected,
        'y'  : y_training,
    },
    'Validation': {
        'X'  : X_validation_selected,
        'y'  : y_validation,
    },
    'Testing': {
        'X'  : X_testing[selected_features],
        'y'  : y_testing,
    },
}

score_df = pf.print_model_scores(model, input_data, is_print=False)
score_df
Out[41]:
Training Validation Testing Validation - Training Testing - Training Testing - Validation
Accuracy 0.776815 0.769971 0.746777 -0.006844 -0.030039 -0.023195
Precision 0.565615 0.558252 0.467652 -0.007363 -0.097963 -0.090600
Recall 0.829851 0.801394 0.794349 -0.028457 -0.035502 -0.007045
F1 0.672716 0.658083 0.588714 -0.014633 -0.084002 -0.069369
AUC 0.881921 0.849454 0.837653 -0.032467 -0.044268 -0.011801
KS 0.589898 0.571795 0.531200 -0.018103 -0.058698 -0.040594
In [42]:
# pkl_path = './20221027数据/GBDT_城投中债隐含评级.pkl'
# model = joblib.load(pkl_path)
# # model.get_all_params()
In [43]:
# 使用测试集
X_merged = X_testing[selected_features]
y_merged = y_testing

target0 = model.predict_proba(X_merged[y_merged == 0])  # 0是好客户,1是坏客户
target1 = model.predict_proba(X_merged[y_merged == 1])
In [44]:
all_probs = {
    'good': target0[:, 1], # 取预测概率的第1列,即预测为1的概率,即坏客户的概率
    'bad': target1[:, 1],
}
# 所以理论上,概率越大,就越多坏客户
ddf = pf.draw_proba_distribution(all_probs, step=0.1, is_return=True, fig_save=False)
# ddf.to_excel('./ddf.xlsx')
In [45]:
ddf  # 等宽
Out[45]:
good bad ALL rate_good rate_bad
0.0-0.1 296 0 296 1.000000 0.000000
0.1-0.2 406 6 412 0.985437 0.014563
0.2-0.3 356 27 383 0.929504 0.070496
0.3-0.4 291 54 345 0.843478 0.156522
0.4-0.5 230 44 274 0.839416 0.160584
0.5-0.6 188 76 264 0.712121 0.287879
0.6-0.7 174 126 300 0.580000 0.420000
0.7-0.8 140 169 309 0.453074 0.546926
0.8-0.9 73 130 203 0.359606 0.640394
0.9-1.0 1 5 6 0.166667 0.833333
In [46]:
## 通过观察排序性和模型分调参,网格搜索
# search_params = {
#     'iterations': range(400, 1001, 200),
#     'depth': [4],
#     'learning_rate': [0.1, 0.05, 0.01, 0.02],
#     'l2_leaf_reg': [0, 7, 10, 20, 50, 100],
#     'scale_pos_weight': [good_bad_rate]
# }

# times = 1
# for k, v in search_params.items():
#     times *= len(v)
# print(times)

# model = CatBoostClassifier(
#     iterations=1000,
#     depth=4,
#     learning_rate=0.01,
#     l2_leaf_reg=7,
#     verbose=False,
#     scale_pos_weight=good_bad_rate,
# )

# model = pf.k_fold_model_with_param_search(
#     model=model,
#     X=X_training_selected,
#     y=y_training,
#     search_params=search_params,
#     score_fnc='auc',
#     K=5,
#     verbose=1
# )

特征重要性分析¶

In [47]:
model.feature_importances_
Out[47]:
array([17.59289737, 18.43422405,  6.18989722,  7.72593761,  4.78080144,
        2.21646645,  1.51962489,  3.14235894,  1.57769111,  2.22454699,
        1.61439207,  2.9287834 ,  2.8568244 ,  2.87556112,  3.53823908,
        0.97630477,  2.02346584,  2.59547113,  0.86194183,  1.77709818,
        4.87046848,  1.30881883,  0.75369305,  0.57469032,  0.33424009,
        1.04489914,  0.54679083,  1.43597084,  0.49217855,  1.18572198])
In [48]:
# Convert the model's raw feature importances to proportions (sum to 1),
# pair them with the selected feature names, and plot them.
model_feature_importance = pd.DataFrame()
model_feature_importance['importance'] = model.feature_importances_ / sum(
    model.feature_importances_)
model_feature_importance['features'] = selected_features  # assumes same order as the training columns — TODO confirm
model_feature_importance.sort_values('importance', ascending=False, inplace=True)
model_feature_importance.index = range(model_feature_importance.shape[0])  # reset to 0..n-1 so draw_feature_importance's [0] lookup works

draw_feature_importance(model_feature_importance, 'importance', 10)
In [49]:
model_feature_importance
Out[49]:
importance features
0 0.184342 成交_成交土地均价(元/㎡)
1 0.175929 资产规模数据_资产总计
2 0.077259 区域财政_政府负债率(%)
3 0.061899 财报衍生_净资产(即所有者权益)
4 0.048705 财报衍生_其他应收款/总资产
5 0.047808 区域财政_财政自给率(%)
6 0.035382 区域财政_地方政府债务率(%)
7 0.031424 财报衍生_固定资产
8 0.029288 区域贷款_区域银行贷款余额(亿元)
9 0.028756 区域利差_1年期不限评级
10 0.028568 区域财政_一般预算收入占比(%)
11 0.025955 财报衍生_其他应付款/流动负债
12 0.022245 财报衍生_净利润
13 0.022165 财报衍生_流动资产
14 0.020235 偿债能力_现金比率
15 0.017771 区域财政_基金收入占比(%)
16 0.016144 区域经济_生产总值(GDP)(元)
17 0.015777 偿债能力_货币资金/短期债务
18 0.015196 区域财政_存续城投债务规模/一般预算收入(%)
19 0.014360 区域财政_财政收入(元)
20 0.013088 偿债能力_有形净值债务率
21 0.011857 成长能力_归属母公司股东的权益(相对年初增长率)
22 0.010449 财报衍生_其他应付款/流动资产
23 0.009763 区域财政_一般预算收入增长
24 0.008619 每股指标_每股现金流量净额
25 0.007537 财报衍生_无形资产
26 0.005747 财报衍生_存货周转天数
27 0.005468 成长能力_净利润复合年增长率
28 0.004922 成长能力_营业利润(N年,增长率)
29 0.003342 区域经济_生产总值同比增长(%)
In [50]:
# model_feature_importance.to_csv('./models/v1/model_feature_importance.csv', index=False)

排序性¶

等宽排序性¶

In [51]:
X_merged = X_testing[selected_features]
y_merged = y_testing

target0 = model.predict_proba(X_merged[y_merged == 0])
target1 = model.predict_proba(X_merged[y_merged == 1])

all_probs = {
    'good': target0[:, 1], # 取预测概率的第1列,即预测为1的概率,即坏客户的概率
    'bad': target1[:, 1],
}
# 所以理论上,概率越大,就越多坏客户
ddf = pf.draw_proba_distribution(all_probs, step=0.05, is_return=True, fig_save=False)

# ddf
# ddf.to_excel('./ddf.xlsx')
In [52]:
row=1
target0[:, row].min(), target1[:, row].min(), target0[:, row].max(), target1[:, row].max()
Out[52]:
(0.021938861706956424,
 0.13981124735876804,
 0.9049597121693593,
 0.9059573864639114)
In [53]:
y_testing.value_counts()
Out[53]:
0    2155
1     637
Name: target, dtype: int64
In [54]:
ddf
Out[54]:
good bad ALL rate_good rate_bad
0.0-0.05 97 0 97 1.000000 0.000000
0.05-0.1 199 0 199 1.000000 0.000000
0.1-0.15 235 2 237 0.991561 0.008439
0.15-0.2 171 4 175 0.977143 0.022857
0.2-0.25 184 9 193 0.953368 0.046632
0.25-0.3 172 18 190 0.905263 0.094737
0.3-0.35 164 19 183 0.896175 0.103825
0.35-0.4 127 35 162 0.783951 0.216049
0.4-0.45 109 21 130 0.838462 0.161538
0.45-0.5 121 23 144 0.840278 0.159722
0.5-0.55 99 29 128 0.773438 0.226562
0.55-0.6 89 47 136 0.654412 0.345588
0.6-0.65 84 52 136 0.617647 0.382353
0.65-0.7 90 74 164 0.548780 0.451220
0.7-0.75 80 87 167 0.479042 0.520958
0.75-0.8 60 82 142 0.422535 0.577465
0.8-0.85 62 84 146 0.424658 0.575342
0.85-0.9 11 46 57 0.192982 0.807018
0.9-0.95 1 5 6 0.166667 0.833333
0.95-1.0 0 0 0 0.000000 0.000000
In [55]:
pf.get_ddf_table(ddf).round(4).copy()
Out[55]:
区间好 区间坏 区间总人数 区间好占比 区间坏占比 累计总人数 累计人数占比 累计好占比 累计坏占比 当前区间坏占比 KS 随机 提升(同区间) 提升(累计) 总体坏比例下降到 策略价值
0.0-0.05 97 0 97 1.0000 0.0000 97 0.0347 0.0450 0.0000 0.0000 -0.0450 0.2282 -1.0000 0.0000 0.2282 NaN
0.05-0.1 199 0 199 1.0000 0.0000 296 0.1060 0.1374 0.0000 0.0000 -0.1374 0.2282 -1.0000 0.0000 0.2282 NaN
0.1-0.15 235 2 237 0.9916 0.0084 533 0.1909 0.2464 0.0031 0.0038 -0.2433 0.2282 -0.9836 0.0209 0.2274 -0.7692
0.15-0.2 171 4 175 0.9771 0.0229 708 0.2536 0.3258 0.0094 0.0085 -0.3163 0.2282 -0.9629 0.0471 0.2260 -0.9782
0.2-0.25 184 9 193 0.9534 0.0466 901 0.3227 0.4111 0.0235 0.0166 -0.3876 0.2282 -0.9270 0.0942 0.2228 -1.1913
0.25-0.3 172 18 190 0.9053 0.0947 1091 0.3908 0.4910 0.0518 0.0302 -0.4391 0.2282 -0.8674 0.1727 0.2163 -1.3778
0.3-0.35 164 19 183 0.8962 0.1038 1274 0.4563 0.5671 0.0816 0.0408 -0.4854 0.2282 -0.8211 0.2332 0.2095 -1.4033
0.35-0.4 127 35 162 0.7840 0.2160 1436 0.5143 0.6260 0.1366 0.0606 -0.4894 0.2282 -0.7345 0.3414 0.1970 -1.4206
0.4-0.45 109 21 130 0.8385 0.1615 1566 0.5609 0.6766 0.1695 0.0690 -0.5070 0.2282 -0.6977 0.3768 0.1895 -1.3366
0.45-0.5 121 23 144 0.8403 0.1597 1710 0.6125 0.7327 0.2057 0.0766 -0.5271 0.2282 -0.6642 0.4113 0.1812 -1.2612
0.5-0.55 99 29 128 0.7734 0.2266 1838 0.6583 0.7787 0.2512 0.0871 -0.5275 0.2282 -0.6185 0.4567 0.1708 -1.1798
0.55-0.6 89 47 136 0.6544 0.3456 1974 0.7070 0.8200 0.3250 0.1049 -0.4950 0.2282 -0.5404 0.5416 0.1540 -1.0749
0.6-0.65 84 52 136 0.6176 0.3824 2110 0.7557 0.8589 0.4066 0.1227 -0.4523 0.2282 -0.4620 0.6255 0.1354 -0.9488
0.65-0.7 90 74 164 0.5488 0.4512 2274 0.8145 0.9007 0.5228 0.1464 -0.3779 0.2282 -0.3582 0.7468 0.1089 -0.7745
0.7-0.75 80 87 167 0.4790 0.5210 2441 0.8743 0.9378 0.6593 0.1721 -0.2785 0.2282 -0.2459 0.8791 0.0777 -0.5573
0.75-0.8 60 82 142 0.4225 0.5775 2583 0.9251 0.9657 0.7881 0.1943 -0.1776 0.2282 -0.1482 0.9851 0.0484 -0.3442
0.8-0.85 62 84 146 0.4247 0.5753 2729 0.9774 0.9944 0.9199 0.2147 -0.0745 0.2282 -0.0588 1.0823 0.0183 -0.1390
0.85-0.9 11 46 57 0.1930 0.8070 2786 0.9979 0.9995 0.9922 0.2268 -0.0074 0.2282 -0.0057 1.1024 0.0018 -0.0132
0.9-0.95 1 5 6 0.1667 0.8333 2792 1.0000 1.0000 1.0000 0.2282 0.0000 0.2282 0.0000 1.0526 0.0000 0.0000
0.95-1.0 0 0 0 0.0000 0.0000 2792 1.0000 1.0000 1.0000 0.2282 0.0000 0.2282 0.0000 1.0000 0.0000 0.0000

等频排序性¶

In [56]:
def ww_get_cutoff(df, target_col, bins_col, if_plot=True):
    """
    Build a per-bin cutoff statistics table (counts, rates, cumulative
    shares, KS, lift) from a labeled, binned DataFrame, optionally plotting
    the per-bin good/bad rate distribution.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the label column and the bin column.
    target_col : str
        Binary label column name; 1 = bad, 0 = good.
    bins_col : str
        Column holding the probability bins (e.g. from pd.qcut).
    if_plot : bool
        Whether to draw the grouped bar chart of good/bad rates per bin.

    Returns
    -------
    (cutoff, ddf) : tuple of DataFrames
        `cutoff` is the full statistics table; `ddf` is a compact subset
        with English column names.
    """
    cutoff = pd.DataFrame()
    # Per-bin count of good (label 0) rows.
    # NOTE(review): the descending sort appears cosmetic — assigning into
    # `cutoff` re-aligns on the bin index anyway.
    cutoff['区间好'] = df[df[target_col] == 0].groupby(bins_col).count()[
        target_col].sort_index(ascending=False)
    # Per-bin count of bad (label 1) rows.
    cutoff['区间坏'] = df[df[target_col] == 1].groupby(bins_col).count()[
        target_col]
    cutoff['区间总人数'] = df.groupby(bins_col).count()[target_col]
    cutoff['区间好占比'] = cutoff['区间好'] / cutoff['区间总人数']
    cutoff['区间坏占比'] = cutoff['区间坏'] / cutoff['区间总人数']
    cutoff['区间人数占比'] = cutoff['区间总人数'] / cutoff['区间总人数'].sum()
    # Cumulative statistics, accumulated in the current row order of `cutoff`.
    cutoff['累计总人数'] = cutoff['区间总人数'].cumsum()
    cutoff['累计人数占比'] = cutoff['累计总人数'] / cutoff['区间总人数'].sum()
    cutoff['累计好占比'] = cutoff['区间好'].cumsum() / cutoff['区间好'].sum()
    # Cumulative bad share = recall when cutting at this bin.
    cutoff['累计坏占比(recall)'] = cutoff['区间坏'].cumsum() / cutoff['区间坏'].sum()
    # Cumulative bad count over cumulative population = precision at this cut.
    cutoff['当前区间坏占比(precision)'] = cutoff['区间坏'].cumsum() / \
        cutoff['区间总人数'].cumsum()
    # KS statistic: max separation between cumulative bad and good shares.
    cutoff['KS'] = cutoff['累计坏占比(recall)'] - cutoff['累计好占比']
    # Baseline bad rate if customers were picked at random.
    cutoff['随机'] = cutoff['区间坏'].sum() / cutoff['区间总人数'].sum()
    # Lift of the cumulative precision over the random baseline.
    cutoff['提升'] = cutoff['当前区间坏占比(precision)'] / cutoff['随机'] - 1
    # Residual overall bad rate after removing everything up to this bin.
    cutoff['总体坏比例下降到'] = (
        (cutoff['区间坏'].sum() - cutoff['区间坏'].cumsum()) / cutoff['区间总人数'].sum()
    )
    cutoff = cutoff.reset_index()

    # Compact English-named view of the core per-bin columns.
    ddf = pd.DataFrame()
    ddf[['bins', 'good', 'bad', 'ALL', 'rate_good', 'rate_bad']
        ] = cutoff[['bins', '区间好', '区间坏', '区间总人数', '区间好占比', '区间坏占比']]
    if if_plot:
        plt.figure(figsize=(10, 5))
        # Reverse the bin order so lower-probability bins plot on the left;
        # negative/positive widths place the two bars side by side.
        good_bar = plt.bar(x=cutoff['bins'].astype(
            str)[::-1], height=cutoff['区间好占比'][::-1], align='edge', width=-0.3)
        bad_bar = plt.bar(x=cutoff['bins'].astype(
            str)[::-1], height=cutoff['区间坏占比'][::-1], align='edge', width=0.3)
        plt.xlabel('Proba Bins')
        plt.ylabel('Ratio')
        plt.title('Distribution By Labels', fontsize=20)
        plt.legend(handles=[good_bar, bad_bar], labels=[
                   "rate_good", "rate_bad"], loc="upper right", fontsize=14)
        plt.xticks(rotation=60)
        plt.show()
    return cutoff, ddf
In [57]:
tmp = X_merged.copy()
tmp['target'] = y_merged.values.copy()
tmp['proba'] = model.predict_proba(X_merged)[:,1]  # 取预测为1的概率
tmp['bins'] = pd.qcut(tmp['proba'], q=10)
cutoff, ddf = ww_get_cutoff(tmp, 'target', 'bins')
ddf
Out[57]:
bins good bad ALL rate_good rate_bad
0 (0.779, 0.906] 105 175 280 0.375000 0.625000
1 (0.691, 0.779] 137 142 279 0.491039 0.508961
2 (0.591, 0.691] 159 120 279 0.569892 0.430108
3 (0.485, 0.591] 202 77 279 0.724014 0.275986
4 (0.387, 0.485] 233 46 279 0.835125 0.164875
5 (0.306, 0.387] 237 42 279 0.849462 0.150538
6 (0.235, 0.306] 257 22 279 0.921147 0.078853
7 (0.156, 0.235] 269 10 279 0.964158 0.035842
8 (0.0964, 0.156] 272 3 275 0.989091 0.010909
9 (0.0209, 0.0964] 284 0 284 1.000000 0.000000
In [58]:
cutoff
Out[58]:
bins 区间好 区间坏 区间总人数 区间好占比 区间坏占比 区间人数占比 累计总人数 累计人数占比 累计好占比 累计坏占比(recall) 当前区间坏占比(precision) KS 随机 提升 总体坏比例下降到
0 (0.779, 0.906] 105 175 280 0.375000 0.625000 0.100287 280 0.100287 0.048724 0.274725 0.625000 0.226001 0.228152 1.739403 0.165473
1 (0.691, 0.779] 137 142 279 0.491039 0.508961 0.099928 559 0.200215 0.112297 0.497645 0.567084 0.385348 0.228152 1.485555 0.114613
2 (0.591, 0.691] 159 120 279 0.569892 0.430108 0.099928 838 0.300143 0.186079 0.686028 0.521480 0.499949 0.228152 1.285669 0.071633
3 (0.485, 0.591] 202 77 279 0.724014 0.275986 0.099928 1117 0.400072 0.279814 0.806907 0.460161 0.527093 0.228152 1.016907 0.044054
4 (0.387, 0.485] 233 46 279 0.835125 0.164875 0.099928 1396 0.500000 0.387935 0.879121 0.401146 0.491186 0.228152 0.758242 0.027579
5 (0.306, 0.387] 237 42 279 0.849462 0.150538 0.099928 1675 0.599928 0.497912 0.945055 0.359403 0.447143 0.228152 0.575280 0.012536
6 (0.235, 0.306] 257 22 279 0.921147 0.078853 0.099928 1954 0.699857 0.617169 0.979592 0.319345 0.362422 0.228152 0.399703 0.004656
7 (0.156, 0.235] 269 10 279 0.964158 0.035842 0.099928 2233 0.799785 0.741995 0.995290 0.283923 0.253295 0.228152 0.244447 0.001074
8 (0.0964, 0.156] 272 3 275 0.989091 0.010909 0.098496 2508 0.898281 0.868213 1.000000 0.253987 0.131787 0.228152 0.113238 0.000000
9 (0.0209, 0.0964] 284 0 284 1.000000 0.000000 0.101719 2792 1.000000 1.000000 1.000000 0.228152 0.000000 0.228152 0.000000 0.000000

模型分分布¶

In [59]:
plt.figure(figsize=(10, 7))
tmp = pd.concat([data_df, testSet], axis=0)  # 注意,这里用了所有数据
for year in [2018, 2019, 2020]:
    y_test_prob = model.predict_proba(tmp.loc[tmp['year']==year, selected_features])
    sns.distplot(y_test_prob[:, 1], label=year)
plt.legend()
plt.xlim((0, 1))
plt.xlabel('model_score')
plt.show()
In [60]:
all_probs={
    'good': model.predict_proba(
        testSet.loc[testSet['target'] == 0, selected_features])[:, 1],
    'bad': model.predict_proba(
        testSet.loc[testSet['target'] == 1, selected_features])[:, 1],
}
pf.draw_score_distribution(all_probs, split_score=0.3, xlim=(0, 1))
None
('mean0 - mean1 = ', -0.29807889605519566)
0.3分以下的坏用户占坏用户总数的: 5.181%
0.3分以下的好用户占好用户总数的: 49.095%
0.3分以下好坏客户比= 32.061

0.3分以上的坏用户占坏用户总数的: 94.819%
0.3分以上的好用户占好用户总数的: 50.905%
0.3分以上好坏客户比= 1.816

In [61]:
tmp = pd.concat([data_df, testSet], axis=0)
for year in [2018, 2019, 2020]:
    y_test_prob = tmp.loc[tmp['year']==year, 'next_year_ipd']
    sns.distplot(y_test_prob, label=year)
plt.legend()
plt.show()

PSI¶

In [62]:
def score_func(x):
    # Identity transform: a placeholder hook where a score-scaling function
    # could be plugged in before computing PSI on model probabilities.
    return x

targetV = model.predict_proba(X_validation_selected)
targetT = model.predict_proba(X_training_selected)

all_probs = {
    'Validation': score_func(targetV[:, 1]),
    'Traning': score_func(targetT[:, 1])   
}

PSI = pf.get_PSI_score(all_probs, is_return=True)
PSI score: 0.020767
In [63]:
targetV = model.predict_proba(X_validation_selected)
targetT = model.predict_proba(X_testing[selected_features])

all_probs = {
    'Validation': score_func(targetV[:, 1]),
    'Testing': score_func(targetT[:, 1])   
}

PSI = pf.get_PSI_score(all_probs, is_return=True)
PSI score: 0.025071

KS & ROC & AUC¶

In [64]:
y_test_prob = model.predict_proba(X_testing[selected_features])
ksdf = pf.draw_KS_line(y_test_prob[:, 0], y_testing, n=20,  # 用0列
                fig_save=False, is_return=True, 
                mode='plt')

模型保存¶

In [65]:
# 保存模型与对应features的importance

# date = time.strftime('%Y%m%d-%H%M%S', time.localtime(time.time()+8*3600))
# message = f'城投中债隐含评级>=6_20220929'
# joblib.dump(model, f'./20220929任务结果/{date}_GBDT_{message}.pkl')
# model_feature_importance.to_csv(
#     f'./20220929任务结果/{date}_feature_importance_{message}.csv',
#     index=False
# )
# print(f'./pkls/{date}_GBDT_{message}.pkl')
# print(model.feature_names_)

单变量分析¶

分布与坏占比分析¶

In [66]:
def bin_split(data, col, target, cut_num, hue=None, is_discrete=False):
    '''Univariate equal-frequency binning analysis.

    Bins `col`, then draws two bar charts side by side: the bad rate
    per bin and the sample ratio per bin (optionally split by `hue`).
    PS: the last bin's boundary is a closed interval on both sides.

    Parameters
    ----------
    data : dataFrame
        Table to run the univariate analysis on.

    col : string
        Name of the feature to analyse.

    target : string
        Name of the target column used to compute the bad-customer
        ratio; 1 = bad, 0 = good.

    cut_num : int
        Number of equal-frequency bins for the feature.

    hue : string
        Name of the feature to group by.

    is_discrete : bool
        Whether `col` is a discrete feature.

    Return
    ----------
    return groupby_df
        The grouped analysis result table.
    '''

    def intervel_to_str(a):
        # Render a pandas Interval as a compact '[left,right)' label:
        # negative or large (>10) endpoints are rounded to integers,
        # small non-negative endpoints keep their float repr.
        if a.left < 0:
            left_num = str(int(round(a.left, 0)))
        elif a.left > 10:
            # NOTE(review): identical body to the branch above — the two
            # conditions could be merged; kept as-is for behavior parity.
            left_num = str(int(round(a.left, 0)))
        else:
            left_num = str(a.left)

        if a.right > 10:
            right_num = str(int(round(a.right, 0)))
        else:
            right_num = str(a.right)
        return f'[{left_num},{right_num})'

    values = data[col]
    cut_df = pd.DataFrame()
    # Numeric non-discrete features get equal-frequency cut points;
    # everything else is grouped by its raw values.
    if ((values.dtype == int) or
        (values.dtype == float)) and (is_discrete is False):
        # Percentiles are computed on non-negative values only; fall back
        # to [0] so np.percentile below cannot fail on an empty list.
        tmp_split = values[values >= 0].tolist()
        tmp_split = [0] if tmp_split == [] else tmp_split
        # -9999999 is kept as its own bin edge (presumably a missing-value
        # sentinel used upstream — TODO confirm). The set() removes
        # duplicate edges; +1e-9 makes the max value fall inside the last
        # right-open bin.
        cut_list = list(
            set([
                -9999999,
                min(values), 0,
                max(tmp_split) + 1e-9,
                min(tmp_split)
            ]))
        for i in range(1, cut_num):
            cut_list = list(
                set(cut_list + [np.percentile(tmp_split, 100 / cut_num * i)]))
        cut_list.sort()
        cut_df['cut'] = pd.cut(
            values, cut_list, include_lowest=True, right=False)
    else:
        cut_df['cut'] = data[col]
    cut_df[target] = data[target]
    if hue is not None:
        cut_df['hue'] = data[hue]
    else:
        # Dummy single-group value so the no-hue path below is taken.
        cut_df['hue'] = -999

    hues = cut_df['hue'].unique().tolist()
    if len(hues) > 1:
        # One aggregate frame per hue group, stacked afterwards.
        groupby_dfs = []
        for h in hues:
            groupby_df = pd.DataFrame()
            groups = cut_df[cut_df['hue'] == h].groupby('cut')
            groupby_df['count'] = groups.count()[target]
            groupby_df['sum'] = groups.sum()[target]
            # bad_rate = (# of target==1) / (# of rows) within the bin.
            groupby_df['bad_rate'] = groupby_df['sum'] / groupby_df['count']
            groupby_df['type'] = h
            groupby_df[
                'ratio'] = groupby_df['count'] / groupby_df['count'].sum()
            groupby_dfs.append(groupby_df)

        groupby_df = pd.concat(groupby_dfs, axis=0)
    else:
        groupby_df = pd.DataFrame()
        groups = cut_df.groupby('cut')
        groupby_df['count'] = groups.count()[target]
        groupby_df['sum'] = groups.sum()[target]
        groupby_df['bad_rate'] = groupby_df['sum'] / groupby_df['count']
        groupby_df['ratio'] = groupby_df['count'] / groupby_df['count'].sum()
    groupby_df = groupby_df.reset_index()
    if is_discrete is False:
        # Replace Interval objects with readable string labels.
        groupby_df['cut'] = groupby_df['cut'].apply(intervel_to_str)
    # Drop empty bins (ratio == 0, e.g. duplicated edges).
    groupby_df = groupby_df[groupby_df['ratio'] > 0]
    groupby_df['cut'] = groupby_df['cut'].tolist()

    # Left panel: bad rate per bin.
    plt.figure(figsize=(24, 8))
    plt.subplot(1, 2, 1)
    plt.title(
        f'bad rate of {col}', fontdict={'size': 20}, fontproperties=myfont)
    if len(hues) > 1:
        # NOTE(review): `ci=` was deprecated in seaborn 0.12 in favor of
        # `errorbar=`; works only on older seaborn versions.
        ax = sns.barplot(
            x='cut',
            y='bad_rate',
            data=groupby_df,
            hue='type',
            saturation=0.5,
            ci=0.2,
            capsize=5)
        plt.setp(ax.get_legend().get_texts(), fontsize='15')
        plt.setp(ax.get_legend().get_title(), fontsize='15')
    else:
        ax = sns.barplot(
            x='cut',
            y='bad_rate',
            data=groupby_df,
            saturation=0.5,
            ci=0.2,
            capsize=5)
    plt.setp(ax.get_xaxis().get_label(), fontsize='20')
    plt.setp(ax.get_yaxis().get_label(), fontsize='20')
    plt.setp(ax.get_xaxis().get_ticklabels(), fontsize='15')
    plt.setp(ax.get_yaxis().get_ticklabels(), fontsize='15')
    # Zoom the y-axis tightly around the observed bad-rate range.
    plt.ylim((max(groupby_df['bad_rate'].min() - 1e-5, 0),
              groupby_df['bad_rate'].max() + 1e-5))
    plt.xticks(rotation=90, fontproperties=myfont)

    # Right panel: sample ratio per bin.
    plt.subplot(1, 2, 2)
    plt.title(f'ratio of {col}', fontdict={'size': 20}, fontproperties=myfont)
    if len(hues) > 1:
        ax = sns.barplot(
            x='cut',
            y='ratio',
            data=groupby_df,
            hue='type',
            saturation=0.5,
            ci=0.2)
        plt.setp(ax.get_legend().get_texts(), fontsize='15')
        plt.setp(ax.get_legend().get_title(), fontsize='15')
    else:
        ax = sns.barplot(
            x='cut', y='ratio', data=groupby_df, saturation=0.5, ci=0.2)
    plt.setp(ax.get_xaxis().get_label(), fontsize='20')
    plt.setp(ax.get_yaxis().get_label(), fontsize='20')
    plt.setp(ax.get_xaxis().get_ticklabels(), fontsize='15')
    plt.setp(ax.get_yaxis().get_ticklabels(), fontsize='15')
    plt.xticks(rotation=90, fontproperties=myfont)
    return groupby_df

变量分箱图¶

In [67]:
# Assemble a single labeled frame for the per-feature binning plots.
tmp_df1 = X_training.copy()
tmp_df1['target'] = y_training
tmp_df1['dataset_type'] = 'train_val'
tmp_df2 = X_validation.copy()
tmp_df2['target'] = y_validation
tmp_df2['dataset_type'] = 'train_val'
tmp_df3 = X_testing.copy()
tmp_df3['target'] = y_testing
tmp_df3['dataset_type'] = 'test'
# NOTE(review): tmp_df1 (the training split) is built but NOT included in
# the concat below — presumably a deliberate toggle to compare only
# validation vs. test; confirm this is intended.
single_analysis_df = pd.concat([tmp_df2, tmp_df3], axis=0)

# Draw bin plots for every feature the final model uses, in
# importance order (alternative: loop over all columns, see below).
for i, col in tn(enumerate(model_feature_importance['features']),
                 total=model_feature_importance.shape[0]):
# for i, col in tn(enumerate(single_analysis_df.columns[1:-2]), 
#                  total=len(single_analysis_df.columns[1:-2])):
    print(i+1, col)
    
    # Low-cardinality features (<= 7 distinct values) are treated as
    # discrete; everything else gets equal-frequency numeric bins.
    if len(single_analysis_df[col].unique()) <= 7:
        bin_df = bin_split(data=single_analysis_df, col=col, target='target',
                  cut_num=4, hue='dataset_type', is_discrete=True)
    else:
        bin_df = bin_split(data=single_analysis_df, col=col, target='target',
                  cut_num=4, hue='dataset_type', is_discrete=False)
        
#     bin_df['is_-9999999'] = bin_df['cut'].map(lambda x: 1 if '-9999999' in str(x) else 0)
#     num_ok = 0
#     for idx, t in enumerate(bin_df['type'].unique()):
#         bbb = bin_df[(bin_df['type'] == t) & ((bin_df['is_-9999999'] == 0))]
#         bbb_ratio = bbb['bad_rate'].tolist()
#         if ((bbb_ratio == sorted(bbb_ratio)) or
#             (bbb_ratio == sorted(bbb_ratio, reverse=True))
#            ):
#             num_ok += 1
#     if num_ok == idx + 1:
#         save_path = './savefig/单调/'
#     else:
#         save_path = './savefig/非单调/'

#     save_path = './20220929任务结果/savefig/'
#     if os.path.isdir(save_path) is False:
#         os.makedirs(save_path)
    #plt.savefig(f"{save_path}{i+1}|{col.split('_')[1].replace('/', '|')}|{col.split('_')[0]}.png", bbox_inches='tight')
  0%|          | 0/30 [00:00<?, ?it/s]
1 成交_成交土地均价(元/㎡)
2 资产规模数据_资产总计
3 区域财政_政府负债率(%)
4 财报衍生_净资产(即所有者权益)
5 财报衍生_其他应收款/总资产
6 区域财政_财政自给率(%)
7 区域财政_地方政府债务率(%)
8 财报衍生_固定资产
9 区域贷款_区域银行贷款余额(亿元)
10 区域利差_1年期不限评级
11 区域财政_一般预算收入占比(%)
12 财报衍生_其他应付款/流动负债
13 财报衍生_净利润
14 财报衍生_流动资产
15 偿债能力_现金比率
16 区域财政_基金收入占比(%)
17 区域经济_生产总值(GDP)(元)
18 偿债能力_货币资金/短期债务
19 区域财政_存续城投债务规模/一般预算收入(%)
20 区域财政_财政收入(元)
21 偿债能力_有形净值债务率
22 成长能力_归属母公司股东的权益(相对年初增长率)
23 财报衍生_其他应付款/流动资产
24 区域财政_一般预算收入增长
25 每股指标_每股现金流量净额
26 财报衍生_无形资产
27 财报衍生_存货周转天数
28 成长能力_净利润复合年增长率
29 成长能力_营业利润(N年,增长率)
30 区域经济_生产总值同比增长(%)
In [ ]:
 

特征性能分析¶

In [68]:
# Feature-performance summary: combine the final model's feature
# importance with univariate IV, KS, AUC and VIF measures.
tmp = pd.concat([data_df, testSet], axis=0)
iv_df, woe_df = pf.iv_woe(tmp[['target'] + selected_features], 'target')
vif = pf.check_vif(tmp[selected_features])

KSs = []
AUCs = []
# Copy the FINAL model's feature-importance table as the base frame.
imp_df = model_feature_importance.copy()

# Univariate AUC and KS on the test set for each model feature.
for col in tn(imp_df['features']):
    scores = testSet[col]
    auc = roc_auc_score(testSet['target'], scores)
    fpr, tpr, _ = roc_curve(testSet['target'], scores)
    # Flip features whose raw direction is anti-correlated with the target.
    AUCs.append(max(auc, 1 - auc))
    KSs.append(np.abs(fpr - tpr).max())

imp_df['KS'] = KSs
imp_df['AUC'] = AUCs
imp_df = iv_df.merge(imp_df, on='features').merge(vif, on='features')
imp_df = imp_df.sort_values('importance', ascending=False)
imp_df
iv_woe:   0%|          | 0/30 [00:00<?, ?it/s]
vif:   0%|          | 0/30 [00:00<?, ?it/s]
123671859.64957677
  0%|          | 0/30 [00:00<?, ?it/s]
Out[68]:
features iv importance KS AUC VIF
0 成交_成交土地均价(元/㎡) 1.015336 0.184342 0.442876 0.765070 1.048077e+00
8 资产规模数据_资产总计 0.440099 0.175929 0.241275 0.643716 1.944197e+01
10 区域财政_政府负债率(%) 0.376737 0.077259 0.352890 0.701944 1.236717e+08
11 财报衍生_净资产(即所有者权益) 0.298295 0.061899 0.206440 0.609599 9.278368e+00
19 财报衍生_其他应收款/总资产 0.092960 0.048705 0.132413 0.571922 3.470148e+01
1 区域财政_财政自给率(%) 0.935645 0.047808 0.399876 0.759827 2.124382e+00
12 区域财政_地方政府债务率(%) 0.272864 0.035382 0.319231 0.687272 1.236719e+08
15 财报衍生_固定资产 0.225307 0.031424 0.169627 0.590696 3.136555e+00
2 区域贷款_区域银行贷款余额(亿元) 0.661162 0.029288 0.377001 0.721983 1.041962e+00
5 区域利差_1年期不限评级 0.588315 0.028756 0.230495 0.520084 1.170056e+00
7 区域财政_一般预算收入占比(%) 0.482476 0.028568 0.334062 0.705093 1.556958e+05
13 财报衍生_其他应付款/流动负债 0.226599 0.025955 0.188717 0.612887 7.567243e+01
16 财报衍生_净利润 0.219595 0.022245 0.160474 0.568054 1.417038e+00
14 财报衍生_流动资产 0.226576 0.022165 0.174274 0.592598 3.913026e+00
18 偿债能力_现金比率 0.110320 0.020235 0.200740 0.619950 1.210540e+07
9 区域财政_基金收入占比(%) 0.431765 0.017771 0.309033 0.687950 1.072113e+00
3 区域经济_生产总值(GDP)(元) 0.610885 0.016144 0.355763 0.720621 1.979093e+01
26 偿债能力_货币资金/短期债务 0.064087 0.015777 0.147355 0.574601 7.016615e+00
28 区域财政_存续城投债务规模/一般预算收入(%) 0.024625 0.015196 0.086347 0.502860 1.557140e+05
4 区域财政_财政收入(元) 0.589591 0.014360 0.352329 0.712676 1.950872e+01
17 偿债能力_有形净值债务率 0.147026 0.013088 0.144189 0.588595 1.210582e+07
22 成长能力_归属母公司股东的权益(相对年初增长率) 0.080384 0.011857 0.149205 0.563867 7.979769e+01
29 财报衍生_其他应付款/流动资产 0.018235 0.010449 0.094744 0.546551 5.296924e+01
6 区域财政_一般预算收入增长 0.503313 0.009763 0.303605 0.647989 1.025558e+00
25 每股指标_每股现金流量净额 0.071257 0.008619 0.119301 0.551559 6.866977e+01
21 财报衍生_无形资产 0.089704 0.007537 0.111493 0.555340 1.771541e+00
24 财报衍生_存货周转天数 0.077135 0.005747 0.142353 0.569929 1.032172e+00
23 成长能力_净利润复合年增长率 0.077780 0.005468 0.115107 0.521688 3.605544e+02
27 成长能力_营业利润(N年,增长率) 0.037175 0.004922 0.059886 0.506743 3.591344e+02
20 区域经济_生产总值同比增长(%) 0.089911 0.003342 0.110915 0.541977 1.024103e+00

相关任务-模型跑批投产¶

In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]: